# Load the raw match data.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the script runs on other machines.
epl_data <- read.csv(
  "/Users/ryanlee/Desktop/R/R Projects/Football/dataset/epl.csv"
)

# Snapshots of the yellow- and red-card columns, taken from the full data set
# as loaded (i.e. before any later filtering of `epl_data`).
ycard_data <- select(epl_data, HomeTeam, AwayTeam, Referee, HY, AY)
rcard_data <- select(epl_data, HomeTeam, AwayTeam, Referee, HR, AR)

# Pairwise plots of full-time and half-time goals for both sides.
ggpairs(
  select(epl_data, FTHG, FTAG, HTHG, HTAG),
  cardinality_threshold = 50,
  progress = FALSE
)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

# Number of matches officiated by each referee.
referee_counts <- table(epl_data$Referee)

# Referees with 20 or more matches count as "qualified".
qualified_referees <- names(referee_counts[referee_counts >= 20])

# Keep only matches run by a qualified referee and continue with that subset.
filtered_epl_data <- subset(epl_data, Referee %in% qualified_referees)
epl_data <- filtered_epl_data

# Preview the first rows of the filtered data.
head(epl_data)
## Date HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR Referee HS
## 1 16/08/14 Arsenal Crystal Palace 2 1 H 1 1 D J Moss 14
## 2 16/08/14 Leicester Everton 2 2 D 1 2 A M Jones 11
## 3 16/08/14 Man United Swansea 1 2 A 0 1 A M Dean 14
## 4 16/08/14 QPR Hull 0 1 A 0 0 D C Pawson 19
## 5 16/08/14 Stoke Aston Villa 0 1 A 0 0 D A Taylor 12
## 6 16/08/14 West Brom Sunderland 2 2 D 1 1 D N Swarbrick 10
## AS HST AST HF AF HC AC HY AY HR AR
## 1 4 6 2 13 19 9 3 2 2 0 1
## 2 13 3 3 16 10 3 6 1 1 0 0
## 3 5 5 4 14 20 4 0 2 4 0 0
## 4 11 6 4 10 10 8 9 1 2 0 0
## 5 7 2 2 14 9 2 8 0 3 0 0
## 6 7 5 2 18 9 6 3 3 1 0 0
# Print a per-column summary of the referee-filtered data (column types,
# missing-value counts and distribution statistics).
skim(epl_data)
Data summary
| Name |
epl_data |
| Number of rows |
1857 |
| Number of columns |
22 |
| _______________________ |
|
| Column type frequency: |
|
| character |
6 |
| numeric |
16 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| Date |
0 |
1 |
6 |
10 |
0 |
515 |
0 |
| HomeTeam |
0 |
1 |
3 |
14 |
0 |
29 |
0 |
| AwayTeam |
0 |
1 |
3 |
14 |
0 |
29 |
0 |
| FTR |
0 |
1 |
1 |
1 |
0 |
3 |
0 |
| HTR |
0 |
1 |
1 |
1 |
0 |
3 |
0 |
| Referee |
0 |
1 |
6 |
13 |
0 |
20 |
0 |
Variable type: numeric
| FTHG |
0 |
1 |
1.53 |
1.30 |
0 |
1 |
1 |
2 |
8 |
▇▅▁▁▁ |
| FTAG |
0 |
1 |
1.18 |
1.16 |
0 |
0 |
1 |
2 |
7 |
▇▂▂▁▁ |
| HTHG |
0 |
1 |
0.68 |
0.84 |
0 |
0 |
0 |
1 |
5 |
▇▁▁▁▁ |
| HTAG |
0 |
1 |
0.53 |
0.74 |
0 |
0 |
0 |
1 |
4 |
▇▃▁▁▁ |
| HS |
0 |
1 |
14.10 |
5.72 |
0 |
10 |
13 |
17 |
43 |
▂▇▃▁▁ |
| AS |
0 |
1 |
11.25 |
4.74 |
0 |
8 |
11 |
14 |
30 |
▂▇▅▁▁ |
| HST |
0 |
1 |
4.70 |
2.66 |
0 |
3 |
4 |
6 |
17 |
▇▇▃▁▁ |
| AST |
0 |
1 |
3.84 |
2.24 |
0 |
2 |
4 |
5 |
15 |
▇▆▂▁▁ |
| HF |
0 |
1 |
10.53 |
3.41 |
0 |
8 |
10 |
13 |
24 |
▁▆▇▂▁ |
| AF |
0 |
1 |
11.06 |
3.50 |
1 |
9 |
11 |
13 |
26 |
▂▇▆▁▁ |
| HC |
0 |
1 |
5.82 |
3.12 |
0 |
4 |
5 |
8 |
19 |
▅▇▃▁▁ |
| AC |
0 |
1 |
4.69 |
2.67 |
0 |
3 |
4 |
6 |
15 |
▇▇▃▁▁ |
| HY |
0 |
1 |
1.57 |
1.23 |
0 |
1 |
1 |
2 |
7 |
▇▅▃▁▁ |
| AY |
0 |
1 |
1.75 |
1.28 |
0 |
1 |
2 |
3 |
9 |
▇▇▂▁▁ |
| HR |
0 |
1 |
0.06 |
0.24 |
0 |
0 |
0 |
0 |
2 |
▇▁▁▁▁ |
| AR |
0 |
1 |
0.08 |
0.27 |
0 |
0 |
0 |
0 |
2 |
▇▁▁▁▁ |
# Away goals at half time vs. away goals at full time.
# HTAG takes only a handful of distinct integer values, so the smoother that
# geom_smooth() picks by default here (a GAM with 10 knots) cannot be fitted
# and no trend line is drawn. An explicit linear fit needs no knots and
# always renders.
ggplot(epl_data, aes(x = HTAG, y = FTAG)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)

# Average yellow and red cards per match for each referee (both teams
# combined), reshaped to one row per referee and card colour.
ref_cards <- epl_data |>
  group_by(Referee) |>
  summarise(
    yellow = sum(HY + AY) / n(),
    red = sum(HR + AR) / n()
  ) |>
  pivot_longer(c(yellow, red), names_to = "card", values_to = "freq")
ref_cards
## # A tibble: 40 × 3
## Referee card freq
## <chr> <chr> <dbl>
## 1 A Marriner yellow 3.27
## 2 A Marriner red 0.110
## 3 A Taylor yellow 3.55
## 4 A Taylor red 0.107
## 5 C Kavanagh yellow 3.28
## 6 C Kavanagh red 0.1
## 7 C Pawson yellow 3.44
## 8 C Pawson red 0.195
## 9 G Scott yellow 2.58
## 10 G Scott red 0.12
## # ℹ 30 more rows
# Side-by-side bars: average yellow and red cards per match for each referee.
ref_cards |>
  ggplot(aes(x = Referee, y = freq, fill = card)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("red" = "red", "yellow" = "yellow")) +
  labs(
    title = "Expected Cards per Match vs Referee",
    y = "Expected Cards per Match",
    fill = "Card"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Columns needed to attribute yellow cards to teams and referees.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HY, AY)

# One row per team per match: the home side with its yellow cards ...
home_cards <- card_data |>
  transmute(
    team = HomeTeam, opp_team = AwayTeam, cards = HY, Referee,
    location = "home"
  )
# ... and the away side with its yellow cards.
away_cards <- card_data |>
  transmute(
    team = AwayTeam, opp_team = HomeTeam, cards = AY, Referee,
    location = "away"
  )

# Yellow cards each referee has shown to each team, home and away appearances
# pooled. `.groups = "drop"` returns an ungrouped result directly, so no
# "grouped output" message is emitted and no trailing ungroup() is needed.
all_cards <- bind_rows(home_cards, away_cards) |>
  group_by(team, Referee) |>
  summarize(total_cards = sum(cards), .groups = "drop") |>
  filter(!is.na(total_cards))

# Yellow cards per team across all referees.
team_totals <- all_cards |>
  group_by(team) |>
  summarize(total_cards = sum(total_cards)) |>
  filter(!is.na(total_cards))

# Share of each team's yellow cards that was shown by each referee
# (a fraction between 0 and 1 that sums to 1 within a team).
card_percentages <- all_cards |>
  left_join(team_totals, by = "team", suffix = c("_referee", "_team")) |>
  mutate(card_percentage = total_cards_referee / total_cards_team) |>
  select(-total_cards_referee, -total_cards_team)

# Plot the shares for a single team. The figures include that team's home AND
# away matches, so the labels refer to the team rather than the "home team".
x <- "Man United"
team_cards <- card_percentages |>
  filter(team == x)
ggplot(team_cards, aes(x = Referee, y = card_percentage)) +
  geom_tile(aes(fill = card_percentage), color = "white") +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = paste("Team:", x),
    x = "Referee",
    y = "Share of Team's Yellow Cards"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Yellow cards shown to the home and the away side, one row per team per match.
# Built from `epl_data`, which by this point only contains referees with 20+
# matches. The `ycard_data` snapshot was taken before that filter, so using it
# here would put unqualified referees into the chart.
home_cards <- epl_data |>
  transmute(team = HomeTeam, Referee, cards = HY, location = "home")
away_cards <- epl_data |>
  transmute(team = AwayTeam, Referee, cards = AY, location = "away")

# Stack both sides into one long table.
all_cards <- rbind(home_cards, away_cards)

# Average yellow cards per match for each referee, split by home/away side.
# `.groups = "drop"` returns an ungrouped result without the summarise message.
team_totals <- all_cards |>
  group_by(Referee, location) |>
  summarize(total_cards = sum(cards) / n(), .groups = "drop") |>
  filter(!is.na(total_cards))

# Same figures under the name `card_percentage` (one value per referee and
# location, so the mean is taken over a single row).
card_percentages <- team_totals |>
  group_by(Referee, location) |>
  summarize(card_percentage = mean(total_cards), .groups = "drop")

# Dodged bars: yellow cards per match, home vs. away, for each referee.
ggplot(team_totals, aes(x = Referee, y = total_cards, fill = location)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("home" = "blue", "away" = "red")) +
  labs(
    title = "Yellow Card Per Match vs. Referee",
    x = "Referee",
    y = "Yellow Card Per Match",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Red cards shown to the home and the away side, one row per team per match.
# Built from `epl_data`, which by this point only contains referees with 20+
# matches. The `rcard_data` snapshot was taken before that filter, so using it
# here would put unqualified referees into the chart and into
# `card_percentages`, which later code uses as its list of referees.
home_cards <- epl_data |>
  transmute(team = HomeTeam, Referee, cards = HR, location = "home")
away_cards <- epl_data |>
  transmute(team = AwayTeam, Referee, cards = AR, location = "away")

# Stack both sides into one long table.
all_cards <- rbind(home_cards, away_cards)

# Average red cards per match for each referee, split by home/away side.
# `.groups = "drop"` returns an ungrouped result without the summarise message.
team_totals <- all_cards |>
  group_by(Referee, location) |>
  summarize(total_cards = sum(cards) / n(), .groups = "drop") |>
  filter(!is.na(total_cards))

# Same figures under the name `card_percentage` (one value per referee and
# location, so the mean is taken over a single row).
card_percentages <- team_totals |>
  group_by(Referee, location) |>
  summarize(card_percentage = mean(total_cards), .groups = "drop")

# Dodged bars: red cards per match, home vs. away, for each referee.
ggplot(team_totals, aes(x = Referee, y = total_cards, fill = location)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("home" = "blue", "away" = "red")) +
  labs(
    title = "Red Card Per Match vs. Referee",
    x = "Referee",
    y = "Red Card Per Match",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Per-match yellow cards, fouls and their ratio (both teams combined), keeping
# the referee for grouping in the plot.
# This is a row-wise calculation, so transmute() is used: summarise() is meant
# to return one row per group, and returning one row per match from it is
# deprecated since dplyr 1.1.0.
epl_data_processed_yellow <- epl_data |>
  filter(!is.na(AY), !is.na(HY), !is.na(HF), !is.na(AF)) |>
  transmute(
    Referee,
    Yellow = AY + HY,
    Fouls = HF + AF,
    Ratio = Yellow / Fouls
  )

# Distribution of yellow cards per foul across each referee's matches.
ggplot(epl_data_processed_yellow, aes(x = Referee, y = Ratio, group = Referee)) +
  geom_boxplot() +
  labs(
    title = "Yellow Card Ratio vs. Referee",
    x = "Referee",
    y = "Yellow Cards per Foul",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Per-match red cards, fouls and their ratio (both teams combined), keeping
# the referee for grouping in the plot.
# This is a row-wise calculation, so transmute() is used: summarise() is meant
# to return one row per group, and returning one row per match from it is
# deprecated since dplyr 1.1.0. The card count is stored as `Red` (it was
# previously written to a column called `Yellow`).
# NOTE(review): the object name is missing a "d" ("processe"); it is kept
# unchanged because other code may refer to it.
epl_data_processe_red <- epl_data |>
  filter(!is.na(AR), !is.na(HR), !is.na(HF), !is.na(AF)) |>
  transmute(
    Referee,
    Red = AR + HR,
    Fouls = HF + AF,
    Ratio = Red / Fouls
  )

# Distribution of red cards per foul across each referee's matches. The value
# plotted is a per-foul ratio, not a percentage, so the labels say so.
ggplot(epl_data_processe_red, aes(x = Referee, y = Ratio, group = Referee)) +
  geom_boxplot() +
  labs(
    title = "Red Card Ratio vs. Referee",
    x = "Referee",
    y = "Red Cards per Foul",
    fill = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Average fouls (and yellow cards) per match for each referee, then reshaped
# so home and away fouls share one `Fouls` column keyed by `Location`
# ("HF" = home fouls, "AF" = away fouls).
foul_referee <- epl_data |>
  filter(!is.na(AY), !is.na(HY), !is.na(HF), !is.na(AF)) |>
  group_by(Referee) |>
  summarize(
    AF = sum(AF) / n(),
    HF = sum(HF) / n(),
    HY = sum(HY) / n(),
    AY = sum(AY) / n()
  ) |>
  pivot_longer(cols = c(AF, HF), names_to = "Location", values_to = "Fouls")

# Dodged bars: fouls per match, home vs. away, for each referee.
# The legend labels are tied to explicit breaks. Unpaired labels are applied
# in the scale's own order ("AF" before "HF"), which labelled away fouls as
# "Home Fouls" and home fouls as "Away Fouls".
ggplot(foul_referee, aes(x = Referee, y = Fouls, fill = Location)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Fouls vs. Referee", x = "Referee", y = "Fouls Per Match") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  scale_fill_discrete(
    breaks = c("HF", "AF"),
    labels = c("Home Fouls", "Away Fouls")
  )

# The "big 6" Premier League teams.
big_6 <- c("Arsenal", "Chelsea", "Liverpool", "Man City", "Man United", "Tottenham")

# Home matches of the big 6, with the home side's yellow cards per foul.
# NOTE(review): HY / HF is not finite when the home side committed no fouls.
epl_data_big_6_yellow <- epl_data |>
  filter(HomeTeam %in% big_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HY / HF)

# Reference line: median yellow-cards-per-foul ratio over ALL home teams in
# the data, not just the big 6.
epl_data_big6_yellow <- epl_data |>
  mutate(Ratio = HY / HF)
overall_median_fouls_big6_yellow <- median(epl_data_big6_yellow$Ratio, na.rm = TRUE)

# Box plot per big-6 team with the overall median as a dashed line.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_big_6_yellow, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_big6_yellow,
    linetype = "dashed", color = "blue", linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Yellow Cards to Fouls vs Home Team for the Big 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Yellow Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# The "bad 6" comparison group of teams.
bad_6 <- c("Middlesbrough", "QPR", "Fulham", "Cardiff", "Wolves", "Huddersfield")

# Home matches of the bad 6, with the home side's yellow cards per foul.
# NOTE(review): HY / HF is not finite when the home side committed no fouls;
# such rows are dropped by the box plot with a warning.
epl_data_bad_6_yellow <- epl_data |>
  filter(HomeTeam %in% bad_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HY / HF)

# Reference line: median yellow-cards-per-foul ratio over ALL home teams in
# the data, not just the bad 6.
epl_data_bad6_yellow <- epl_data |>
  mutate(Ratio = HY / HF)
overall_median_fouls_bad6_yellow <- median(epl_data_bad6_yellow$Ratio, na.rm = TRUE)

# Box plot per bad-6 team with the overall median as a dashed line.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_bad_6_yellow, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_bad6_yellow,
    linetype = "dashed", color = "blue", linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Yellow Cards to Fouls vs Home Team for the Bad 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Yellow Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# The "big 6" Premier League teams.
big_6 <- c("Arsenal", "Chelsea", "Liverpool", "Man City", "Man United", "Tottenham")

# Home matches of the big 6, with the home side's red cards per foul.
# NOTE(review): HR / HF is not finite when the home side committed no fouls.
epl_data_big_6_red <- epl_data |>
  filter(HomeTeam %in% big_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HR / HF)

# Reference line: median red-cards-per-foul ratio over ALL home teams in the
# data, not just the big 6.
epl_data_big6_red <- epl_data |>
  mutate(Ratio = HR / HF)
overall_median_fouls_big6_red <- median(epl_data_big6_red$Ratio, na.rm = TRUE)

# Box plot per big-6 team with the overall median as a dashed line.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_big_6_red, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_big6_red,
    linetype = "dashed", color = "blue", linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Red Cards to Fouls vs Home Team for the Big 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Red Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# The "bad 6" comparison group of teams.
bad_6 <- c("Middlesbrough", "QPR", "Fulham", "Cardiff", "Wolves", "Huddersfield")

# Home matches of the bad 6, with the home side's red cards per foul.
# The ratio is HR / HF, matching the plot's title and axis and the big-6
# version of this chart; the raw red-card count was used here before.
# NOTE(review): HR / HF is not finite when the home side committed no fouls;
# such rows are dropped by the box plot with a warning.
epl_data_bad_6_red <- epl_data |>
  filter(HomeTeam %in% bad_6) |>
  mutate(Team = as.factor(HomeTeam), Ratio = HR / HF)

# Reference line: median red-cards-per-foul ratio over ALL home teams in the
# data, not just the bad 6.
epl_data_bad6_red <- epl_data |>
  mutate(Ratio = HR / HF)
overall_median_fouls_bad6_red <- median(epl_data_bad6_red$Ratio, na.rm = TRUE)

# Box plot per bad-6 team with the overall median as a dashed line.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot(epl_data_bad_6_red, aes(x = Team, y = Ratio, fill = Team)) +
  geom_boxplot() +
  geom_hline(
    yintercept = overall_median_fouls_bad6_red,
    linetype = "dashed", color = "blue", linewidth = 1
  ) +
  labs(
    title = "Box Plots of Ratio of Red Cards to Fouls vs Home Team for the Bad 6 Premier League Teams",
    x = "Home Team",
    y = "Ratio of Red Cards to Fouls",
    fill = "Team"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Stacked bars per home team: every match contributes a segment of height
# HR / HF, so a bar's height is the sum of the per-match ratios.
# NOTE(review): the y-axis label says "Expected Red Cards per Match", but the
# plotted quantity is a sum of red-cards-per-foul ratios; confirm which was
# intended.
epl_data |>
  ggplot(aes(x = HomeTeam, y = HR / HF)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Red Cards and Fouls by Home Team",
    x = "Home Team",
    y = "Expected Red Cards per Match"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

# Add per-match contributions to each home team's averages: every row holds
# its own fouls / yellow cards / red cards divided by that team's number of
# home matches, so summing a team's rows gives its per-home-match average.
epl_data <- epl_data |>
  group_by(HomeTeam) |>
  mutate(
    fouls_per_game = HF / n(),
    yellow_per_game = HY / n(),
    red_per_game = HR / n()
  ) |>
  ungroup()

# In the three charts below the bars stack one segment per match, so a bar's
# height is the per-team SUM of the `*_per_game` column. reorder() therefore
# ranks teams with `FUN = sum`. Its default (the mean) would divide by the
# number of home matches a second time and mis-order teams whose match counts
# differ. `fill` is a fixed colour rather than a mapped aesthetic, so no fill
# scale is needed.

# Average fouls per home match, highest first.
ggplot(
  epl_data,
  aes(x = reorder(HomeTeam, -fouls_per_game, FUN = sum), y = fouls_per_game)
) +
  geom_col(fill = "blue") +
  labs(title = "Fouls by Home Team", x = "Home Team", y = "Fouls per Match") +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Average yellow cards per home match, highest first.
ggplot(
  epl_data,
  aes(x = reorder(HomeTeam, -yellow_per_game, FUN = sum), y = yellow_per_game)
) +
  geom_col(fill = "yellow") +
  labs(
    title = "Yellow Cards by Home Team",
    x = "Home Team",
    y = "Yellow Cards per Match"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Average red cards per home match, highest first.
ggplot(
  epl_data,
  aes(x = reorder(HomeTeam, -red_per_game, FUN = sum), y = red_per_game)
) +
  geom_col(fill = "red") +
  labs(
    title = "Red Cards by Home Team",
    x = "Home Team",
    y = "Red Cards per Match"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
  )

# Columns needed to attribute yellow cards to teams and referees.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HY, AY)

# Yellow cards and match counts per referee/team pair, separately for the
# team's home matches (`totalx`) and away matches (`totaly`).
# `.groups = "drop"` returns ungrouped results without the summarise message.
home_cards <- card_data |>
  group_by(Referee, team = HomeTeam) |>
  summarize(cards = sum(HY), totalx = n(), .groups = "drop")
away_cards <- card_data |>
  group_by(Referee, team = AwayTeam) |>
  summarize(cards = sum(AY), totaly = n(), .groups = "drop")

# Combine both sides. A full join keeps referee/team pairs that occur only at
# home or only away; the missing side is then counted as zero below. A left
# join would drop pairs seen only away, leaving the NA handling for
# `totalx` / `cards.x` with nothing to do.
all_cards <- full_join(home_cards, away_cards, by = c("Referee", "team"))

# Average yellow cards per match for each referee/team pair.
all_cards <- all_cards |>
  mutate(
    total_games = ifelse(is.na(totalx), 0, totalx) +
      ifelse(is.na(totaly), 0, totaly),
    total_cards = (ifelse(is.na(cards.x), 0, cards.x) +
      ifelse(is.na(cards.y), 0, cards.y)) / total_games
  ) |>
  select(-totalx, -totaly, -cards.x, -cards.y)

# Referees to plot: taken from the table being plotted, so every referee in
# the loop has data (rather than from a table built in an earlier section).
referees <- unique(all_cards$Referee)

# One bar chart per referee: teams ordered by average yellow cards per match.
for (x in referees) {
  team_cards <- all_cards |>
    filter(Referee == x)
  print(
    ggplot(team_cards, aes(x = reorder(team, -total_cards), y = total_cards)) +
      geom_col(fill = "yellow") +
      labs(
        title = paste("Referee:", x),
        x = "Team",
        y = "Average Yellow Cards per Game"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(face = "bold"),
        axis.text.x = element_text(angle = 90, hjust = 1)
      )
  )
}


























# Columns needed to attribute red cards to teams and referees.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HR, AR)

# Red cards and match counts per referee/team pair, separately for the team's
# home matches (`totalx`) and away matches (`totaly`).
# `.groups = "drop"` returns ungrouped results without the summarise message.
home_cards <- card_data |>
  group_by(Referee, team = HomeTeam) |>
  summarize(cards = sum(HR), totalx = n(), .groups = "drop")
away_cards <- card_data |>
  group_by(Referee, team = AwayTeam) |>
  summarize(cards = sum(AR), totaly = n(), .groups = "drop")

# Combine both sides. A full join keeps referee/team pairs that occur only at
# home or only away; the missing side is then counted as zero below. A left
# join would drop pairs seen only away, leaving the NA handling for
# `totalx` / `cards.x` with nothing to do.
all_cards <- full_join(home_cards, away_cards, by = c("Referee", "team"))

# Average red cards per match for each referee/team pair.
all_cards <- all_cards |>
  mutate(
    total_games = ifelse(is.na(totalx), 0, totalx) +
      ifelse(is.na(totaly), 0, totaly),
    total_cards = (ifelse(is.na(cards.x), 0, cards.x) +
      ifelse(is.na(cards.y), 0, cards.y)) / total_games
  ) |>
  select(-totalx, -totaly, -cards.x, -cards.y)

# Referees to plot: taken from the table being plotted, so every referee in
# the loop has data (rather than from a table built in an earlier section).
referees <- unique(all_cards$Referee)

# One bar chart per referee: teams ordered by average red cards per match.
for (x in referees) {
  team_cards <- all_cards |>
    filter(Referee == x)
  print(
    ggplot(team_cards, aes(x = reorder(team, -total_cards), y = total_cards)) +
      geom_col(fill = "red") +
      labs(
        x = "Team",
        y = "Average Red Cards per Game",
        title = paste("Referee:", x)
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  )
}


























# Columns needed to attribute fouls to teams and referees.
card_data <- epl_data |>
  select(HomeTeam, AwayTeam, Referee, HF, AF)

# Fouls and match counts per referee/team pair, separately for the team's home
# matches (`totalx`) and away matches (`totaly`). The object and column names
# still say "cards" for parity with the card charts; the values are fouls.
# `.groups = "drop"` returns ungrouped results without the summarise message.
home_cards <- card_data |>
  group_by(Referee, team = HomeTeam) |>
  summarize(cards = sum(HF), totalx = n(), .groups = "drop")
away_cards <- card_data |>
  group_by(Referee, team = AwayTeam) |>
  summarize(cards = sum(AF), totaly = n(), .groups = "drop")

# Combine both sides. A full join keeps referee/team pairs that occur only at
# home or only away; the missing side is then counted as zero below. A left
# join would drop pairs seen only away, leaving the NA handling for
# `totalx` / `cards.x` with nothing to do.
all_cards <- full_join(home_cards, away_cards, by = c("Referee", "team"))

# Average fouls per match for each referee/team pair.
all_cards <- all_cards |>
  mutate(
    total_games = ifelse(is.na(totalx), 0, totalx) +
      ifelse(is.na(totaly), 0, totaly),
    total_cards = (ifelse(is.na(cards.x), 0, cards.x) +
      ifelse(is.na(cards.y), 0, cards.y)) / total_games
  ) |>
  select(-totalx, -totaly, -cards.x, -cards.y)

# Referees to plot: taken from the table being plotted, so every referee in
# the loop has data (rather than from a table built in an earlier section).
referees <- unique(all_cards$Referee)

# One bar chart per referee: teams ordered by average fouls per match.
for (x in referees) {
  team_cards <- all_cards |>
    filter(Referee == x)
  print(
    ggplot(team_cards, aes(x = reorder(team, -total_cards), y = total_cards)) +
      geom_col(fill = "blue") +
      labs(
        x = "Team",
        y = "Average Fouls per Game",
        title = paste("Referee:", x)
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  )
}


























# Per-team averages of fouls, yellow cards and red cards per match, then split
# into the Big 6 and Bad 6 groups.
big_6 <- c("Arsenal", "Chelsea", "Liverpool", "Man City", "Man United", "Tottenham")
bad_6 <- c("Middlesbrough", "QPR", "Fulham", "Cardiff", "Wolves", "Huddersfield")

# Only the team, foul and card columns are needed.
card_data <- select(epl_data, HomeTeam, AwayTeam, HF, AF, HY, AY, HR, AR)

# Totals per team from its home matches ...
home_cards <- card_data |>
  group_by(team = HomeTeam) |>
  summarize(
    home_fouls = sum(HF),
    home_yellow_cards = sum(HY),
    home_red_cards = sum(HR)
  )
# ... and from its away matches.
away_cards <- card_data |>
  group_by(team = AwayTeam) |>
  summarize(
    away_fouls = sum(AF),
    away_yellow_cards = sum(AY),
    away_red_cards = sum(AR)
  )

# Home + away totals per team (teams are taken from the home table).
all_cards <- home_cards |>
  left_join(away_cards, by = "team") |>
  transmute(
    team,
    total_fouls = home_fouls + away_fouls,
    total_yellow_cards = home_yellow_cards + away_yellow_cards,
    total_red_cards = home_red_cards + away_red_cards
  )

# Matches played per team at home and away.
home_games_played <- epl_data |>
  group_by(team = HomeTeam) |>
  summarize(h_games_played = n())
away_games_played <- epl_data |>
  group_by(team = AwayTeam) |>
  summarize(a_games_played = n())

# Total matches played per team.
games_played <- home_games_played |>
  left_join(away_games_played, by = "team") |>
  transmute(team, total_games_played = h_games_played + a_games_played)

# Convert the totals to per-match averages.
all_cards <- all_cards |>
  left_join(games_played, by = "team") |>
  transmute(
    team,
    avg_fouls = total_fouls / total_games_played,
    avg_yellow_cards = total_yellow_cards / total_games_played,
    avg_red_cards = total_red_cards / total_games_played
  )

# Rows for the two comparison groups.
big_6_cards <- filter(all_cards, team %in% big_6)
bad_6_cards <- filter(all_cards, team %in% bad_6)
# Normal-approximation confidence interval for the mean of `data`.
#
# data:  numeric vector of observations.
# alpha: significance level; the interval has nominal coverage 1 - alpha.
#
# Returns an unnamed numeric vector c(lower, upper). The half-width is
# qnorm(1 - alpha / 2) * sd(data) / sqrt(length(data)).
CI <- function(data, alpha) {
  centre <- mean(data)
  half_width <- qnorm(1 - alpha / 2) * sd(data) / sqrt(length(data))
  c(centre - half_width, centre + half_width)
}
# Compare the Big 6 and Bad 6 on per-match fouls, yellow cards and red cards,
# treating the two groups as independent samples of team-level averages.
# NOTE(review): each group holds only a few teams, so the normal approximation
# used here is rough; a t-based test (e.g. t.test()) may be more appropriate.

# Standard error of the difference between two independent sample means.
diff_se <- function(a, b) {
  sqrt(var(a) / length(a) + var(b) / length(b))
}

# Normal-approximation interval with coverage 1 - alpha for mean(a) - mean(b).
diff_ci <- function(a, b, alpha) {
  (mean(a) - mean(b)) + c(-1, 1) * qnorm(1 - alpha / 2) * diff_se(a, b)
}

# Group means.
big_6_fouls <- mean(big_6_cards$avg_fouls)
bad_6_fouls <- mean(bad_6_cards$avg_fouls)
big_6_yellow_cards <- mean(big_6_cards$avg_yellow_cards)
bad_6_yellow_cards <- mean(bad_6_cards$avg_yellow_cards)
big_6_red_cards <- mean(big_6_cards$avg_red_cards)
bad_6_red_cards <- mean(bad_6_cards$avg_red_cards)

# z statistics for the difference in group means (unequal variances).
z_fouls <- (big_6_fouls - bad_6_fouls) /
  diff_se(big_6_cards$avg_fouls, bad_6_cards$avg_fouls)
z_yellow_cards <- (big_6_yellow_cards - bad_6_yellow_cards) /
  diff_se(big_6_cards$avg_yellow_cards, bad_6_cards$avg_yellow_cards)
z_red_cards <- (big_6_red_cards - bad_6_red_cards) /
  diff_se(big_6_cards$avg_red_cards, bad_6_cards$avg_red_cards)

# Two-sided p-values.
p_fouls <- 2 * pnorm(-abs(z_fouls))
p_yellow_cards <- 2 * pnorm(-abs(z_yellow_cards))
p_red_cards <- 2 * pnorm(-abs(z_red_cards))

# 95% confidence intervals for the difference in group means, using the same
# standard error as the z statistics. The intervals were previously computed
# from element-wise differences, which pairs each Big 6 team with an unrelated
# Bad 6 team by row position and recycles values if the groups differ in size.
big_6_bad_6_fouls_CI <- diff_ci(
  big_6_cards$avg_fouls, bad_6_cards$avg_fouls, 0.05
)
big_6_bad_6_yellow_cards_CI <- diff_ci(
  big_6_cards$avg_yellow_cards, bad_6_cards$avg_yellow_cards, 0.05
)
big_6_bad_6_red_cards_CI <- diff_ci(
  big_6_cards$avg_red_cards, bad_6_cards$avg_red_cards, 0.05
)
# Total fouls vs. total yellow cards per match (both teams combined), with
# ggplot2's default smoother.
epl_data |>
  mutate(yellow = HY + AY, red = HR + AR, fouls = HF + AF) |>
  ggplot(aes(x = fouls, y = yellow)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
